NK/T-cell compartment

In this notebook, we will

  • extract the NK/T-cell compartment
  • Perform unsupervised clustering using the Leiden-algorithm
  • Annotate more fine-grained cell-types
  • Perform sub-clustering of the cell-types

Input data and configuration

# get default parameters the papermill way.
            input_file = "../results/04_annotate_cell_types/adata.h5ad"
            output_file = "tmp/data.h5ad"
            output_file_obs = "tmp/adata_obs.tsv"
            output_file_norm_counts = "tmp/norm_counts.tsv"
            results_dir = "tmp"
            table_dir = "../tables"
# Parameters
            input_file = "input_adata.h5ad"
            output_file = "adata.h5ad"
            output_file_obs = "adata_obs.tsv"
            output_file_norm_counts = "norm_counts.tsv"
            cpus = "1"
            table_dir = "tables"
            results_dir = "."
import pandas as pd
            import scanpy as sc
            import numpy as np
            from matplotlib import pyplot as plt
            import sys
            import os
            
            sys.path.append("lib")
            sys.path.append("../lib")
            
            from jupytertools import fix_logging, display
            
            from operator import or_
            from functools import reduce
            
            fix_logging(sc.settings)
markers = pd.read_csv(os.path.join(table_dir, "cell_type_markers.csv"))
cell_types = markers["cell_type"].unique()
adata = sc.read_h5ad(input_file)
sc.pl.umap(adata, color="cell_type")

# PCA turned out not to be entirely reproducible on different CPU architechtures.
            # For the sake of reproducibility of these notebooks, we load a pre-computed result
            # from the repository. If it doesn't exist, we compute it from scratch.
            def pca_precomputed(adata, key, **kwargs):
                pca_file = os.path.join(table_dir, f"adata_pca_{key}.pkl.gz")
                try:
                    # CSV does not preserver floating point numbers perfectly. 
                    adata.obsm["X_pca"] = pd.read_pickle(pca_file).values
                except IOError:
                    assert False, "should use pre-computed version. "
                    sc.tl.pca(adata, svd_solver="arpack", **kwargs)
                    pd.DataFrame(adata.obsm["X_pca"]).to_pickle(pca_file)

Extract NK and T-cells

adata = adata[adata.obs["cell_type"].isin(["T cell", "NK cell"]), :].copy()
/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead
              if not is_categorical(df_full[k]):
            

Exclude TCR genes

As in a preliminary analysis, they turned out to be dominant DE genes when comparing clusters. In the transcriptomics analysis, we are not interested in the effects of T-cell receptor genes.

prefixes = ["TRAV", "TRBV", "TRBJ", "TRAJ", "TRBD"]
            exclude_genes = reduce(or_, [adata.var_names.str.startswith(x) for x in prefixes])
adata.var_names[exclude_genes]
Index(['TRBV1', 'TRBV2', 'TRBV3-1', 'TRBV4-1', 'TRBV5-1', 'TRBV6-1', 'TRBV4-2',
                   'TRBV6-2', 'TRBV7-2', 'TRBV6-4', 'TRBV7-3', 'TRBV5-3', 'TRBV9',
                   'TRBV10-1', 'TRBV11-1', 'TRBV10-2', 'TRBV11-2', 'TRBV6-5', 'TRBV7-4',
                   'TRBV5-4', 'TRBV6-6', 'TRBV5-5', 'TRBV7-6', 'TRBV5-6', 'TRBV7-7',
                   'TRBV7-9', 'TRBV13', 'TRBV10-3', 'TRBV11-3', 'TRBV12-3', 'TRBV12-4',
                   'TRBV12-5', 'TRBV14', 'TRBV15', 'TRBV16', 'TRBV18', 'TRBV19',
                   'TRBV20-1', 'TRBV21-1', 'TRBV23-1', 'TRBV24-1', 'TRBV25-1', 'TRBV27',
                   'TRBV28', 'TRBV29-1', 'TRBV30', 'TRAV1-1', 'TRAV1-2', 'TRAV2', 'TRAV3',
                   'TRAV4', 'TRAV5', 'TRAV6', 'TRAV8-1', 'TRAV10', 'TRAV12-1', 'TRAV8-2',
                   'TRAV8-3', 'TRAV13-1', 'TRAV12-2', 'TRAV8-4', 'TRAV13-2', 'TRAV14DV4',
                   'TRAV9-2', 'TRAV12-3', 'TRAV8-6', 'TRAV16', 'TRAV17', 'TRAV18',
                   'TRAV19', 'TRAV20', 'TRAV21', 'TRAV22', 'TRAV23DV6', 'TRAV24', 'TRAV25',
                   'TRAV26-1', 'TRAV27', 'TRAV29DV5', 'TRAV30', 'TRAV26-2', 'TRAV34',
                   'TRAV35', 'TRAV36DV7', 'TRAV38-1', 'TRAV38-2DV8', 'TRAV39', 'TRAV40',
                   'TRAV41'],
                  dtype='object')
adata.shape
(17062, 16818)
adata = adata[:, ~exclude_genes].copy()
adata.shape
(17062, 16729)
np.sum(adata.var_names.str.startswith("TRAV"))
0

Cell-type annotation

random_state = 1
            sc.pp.neighbors(adata, random_state=random_state)
            sc.tl.umap(adata, random_state=random_state)
computing neighbors
                using 'X_pca' with n_pcs = 50
            
    finished
            computing UMAP
            
    finished
            
sc.tl.leiden(adata, resolution=3, random_state=42)
running Leiden clustering
            
    finished
            
fig, ax = plt.subplots(figsize=(7, 5))
            sc.pl.umap(
                adata, legend_loc="on data", color="leiden", ax=ax, size=20, legend_fontoutline=3
            )

for ct in cell_types:
                if not "T cell" in ct and ct != "NK cell":
                    continue
                marker_genes = markers.loc[markers["cell_type"] == ct, "gene_identifier"]
                sc.pl.umap(
                    adata,
                    color=marker_genes,
                    title=["{}: {}".format(ct, g) for g in marker_genes],
                    size=15,
                )

fig, ax = plt.subplots(figsize=(7, 5))
            sc.pl.umap(
                adata, legend_loc="on data", color="leiden", ax=ax, size=20, legend_fontoutline=3
            )

annotation_t = {
                "T CD4+": [],
                # 27 is NK, but some of the cells are CD8
                "T CD8+": [8, 0, 6, 20, 11, 29, 21, 22, 14, 27],
                "T reg.": [13, 12],
                "T other": [31],
                "ambiguous": [24, 25, 26, 18],
            }
annot_dict = {str(c): ct for ct, clusters in annotation_t.items() for c in clusters}
            adata.obs["cell_type2"] = [
                annot_dict.get(c, "T CD4+") if ct != "NK cell" else ct
                for c, ct in zip(adata.obs["leiden"], adata.obs["cell_type"])
            ]
sc.pl.umap(adata, color="cell_type2")
/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1192: FutureWarning: is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead
              if is_string_dtype(df[key]) and not is_categorical(df[key])
            
... storing 'cell_type2' as categorical
            

re-annotate ambiguous clusters

adata_ambiguous = adata[adata.obs["cell_type2"] == "ambiguous", :].copy()
/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead
              if not is_categorical(df_full[k]):
            
sc.__version__
'1.6.0'
pca_precomputed(adata_ambiguous, "subcluster_ambiguous")
            sc.pp.neighbors(adata_ambiguous, n_neighbors=5)
            sc.tl.umap(adata_ambiguous)
computing neighbors
                using 'X_pca' with n_pcs = 50
                finished
            computing UMAP
            
    finished
            
sc.tl.leiden(adata_ambiguous, resolution=2)
running Leiden clustering
                finished
            
sc.pl.umap(
                adata_ambiguous, color=["leiden"], legend_loc="on data", legend_fontoutline=4
            )
            sc.pl.umap(adata_ambiguous, color=["CD8A", "CD8B", "FOXP3", "CD4", "n_counts"], ncols=3)

annotation_ambiguous = {
                "T CD4+": [21, 14, 12, 2, 23, 11, 13, 8, 9, 17, 24, 0, 22, 10],
                "T CD8+": [4, 18, 15, 26],
                "T reg.": [6, 1, 5, 19, 25, 3, 16, 7],
                "T other": [20],
            }
annot_dict = {
                str(c): ct for ct, clusters in annotation_ambiguous.items() for c in clusters
            }
            adata_ambiguous.obs["cell_type3"] = [
                annot_dict[c]
                for c, ct in zip(adata_ambiguous.obs["leiden"], adata_ambiguous.obs["cell_type"])
            ]
sc.pl.umap(adata_ambiguous, color="cell_type3")
... storing 'cell_type3' as categorical
            
/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1192: FutureWarning: is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead
              if is_string_dtype(df[key]) and not is_categorical(df[key])
            

integrate annotation into main object

# convert categorical to string
            adata.obs["cell_type2"] = [str(x) for x in adata.obs["cell_type2"]]
            adata.obs.loc[adata_ambiguous.obs_names, "cell_type2"] = adata_ambiguous.obs[
                "cell_type3"
            ]
adata.obs["cell_type"] = adata.obs["cell_type2"]
            del adata.obs["cell_type2"]
sc.pl.umap(
                adata,
                color=[
                    "samples",
                    "n_genes",
                    "mt_frac",
                    "has_ir",
                    "hpv_status",
                    "ir_status",
                    "chain_pairing",
                    "phase",
                    "leiden",
                    "CD4",
                    "FOXP3",
                    "CD8A",
                    "CD19",
                    "is_doublet",
                    "cell_type",
                ],
                ncols=3,
            )
... storing 'cell_type' as categorical
            

Perform sub-clustering for all cell types individually

adatas = dict()
            resolutions = {"T CD4+": 1, "T CD8+": 1, "NK cell": 0.5, "T reg.": 0.5, "T other": 0.5}
            for ct in adata.obs["cell_type"].unique():
                tmp_adata = adata[adata.obs["cell_type"] == ct, :].copy()
                pca_precomputed(
                    tmp_adata,
                    f'subcluster_{ct.lower().replace(" ", "_").replace("+", "").replace(".", "")}',
                )
                sc.pp.neighbors(tmp_adata)
                sc.tl.umap(tmp_adata)
                sc.tl.leiden(tmp_adata, resolution=resolutions[ct])
                sc.pl.umap(tmp_adata, color="leiden", title=ct)
                adatas[ct] = tmp_adata
/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead
              if not is_categorical(df_full[k]):
            
computing neighbors
                using 'X_pca' with n_pcs = 50
            
    finished
            computing UMAP
            
    finished
            running Leiden clustering
            
    finished
            

/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead
              if not is_categorical(df_full[k]):
            
computing neighbors
                using 'X_pca' with n_pcs = 50
            
    finished
            computing UMAP
            
    finished
            running Leiden clustering
            
    finished
            

/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead
              if not is_categorical(df_full[k]):
            
computing neighbors
                using 'X_pca' with n_pcs = 50
                finished
            computing UMAP
            
    finished
            running Leiden clustering
                finished
            

/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead
              if not is_categorical(df_full[k]):
            
computing neighbors
                using 'X_pca' with n_pcs = 50
            
    finished
            computing UMAP
            
    finished
            running Leiden clustering
                finished
            

/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1094: FutureWarning: is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead
              if not is_categorical(df_full[k]):
            
computing neighbors
                using 'X_pca' with n_pcs = 50
                finished
            computing UMAP
            
    finished
            running Leiden clustering
                finished
            

Store results

for ct, tmp_adata in adatas.items():
                adata.obs.loc[tmp_adata.obs.index, "cluster"] = [
                    "{} {}".format(ct, x) for x in tmp_adata.obs["leiden"]
                ]
sc.pl.umap(adata, color=["cluster", "cell_type"])
/opt/conda/lib/python3.8/site-packages/anndata/_core/anndata.py:1192: FutureWarning: is_categorical is deprecated and will be removed in a future version.  Use is_categorical_dtype instead
              if is_string_dtype(df[key]) and not is_categorical(df[key])
            
... storing 'cluster' as categorical
            

adata.write_h5ad(output_file, compression="lzf")
            adata.obs.to_csv(output_file_obs, sep="\t")